A Quick Demo of Parallel Processing in R

R will run in parallel. A number of packages help with this. Some require “forking” which Windows will not support.

  p_load(parallel)
  p_load(MASS)
  starts <- rep(100,4)
  fx <- function(nstart) kmeans(Boston, 4, nstart=nstart)
  numCores <- detectCores()
  numCores

## [1] 4

  microbenchmark(lapply(starts, fx))

## Unit: milliseconds
##                expr      min       lq     mean   median       uq      max
##  lapply(starts, fx) 128.7929 131.4228 135.5263 133.6529 136.6032 160.8576
##  neval
##    100

  microbenchmark(lapply(starts, fx), mclapply(starts, fx, mc.cores=numCores))

## Error in mclapply(starts, fx, mc.cores = numCores): 'mc.cores' > 1 is not supported on Windows

  microbenchmark(results1 <- lapply(starts, fx),
                 results2 <- mclapply(starts, fx, mc.cores=numCores)
                )

## Error in mclapply(starts, fx, mc.cores = numCores): 'mc.cores' > 1 is not supported on Windows

  microbenchmark(lapply(starts, fx), mclapply(starts, fx))

## Unit: milliseconds
##                  expr      min       lq     mean   median       uq
##    lapply(starts, fx) 128.4495 132.5929 138.4325 135.1849 139.1556
##  mclapply(starts, fx) 128.2621 132.5228 137.3198 134.5668 138.8637
##       max neval cld
##  250.1233   100   a
##  229.9955   100   a

  microbenchmark(results1 <- lapply(starts, fx), 
                 results2 <- mclapply(starts, fx)
                )

## Unit: milliseconds
##                              expr      min       lq     mean   median
##    results1 <- lapply(starts, fx) 128.1570 132.1163 136.8817 134.6078
##  results2 <- mclapply(starts, fx) 127.7875 132.7025 137.8772 134.9077
##        uq      max neval cld
##  139.2817 178.2303   100   a
##  138.3704 232.4874   100   a

We can use the doParallel package with the foreach package to improve looping.

First some setup.

  p_load(foreach)
  p_load(doParallel)
  registerDoParallel(2)
  getDoParWorkers()

## [1] 2

  getDoParName()

## [1] "doParallelSNOW"

  getDoParVersion()

## [1] "1.0.14"

Now carry out some simple calculations to show how to use parallel processing.

  microbenchmark(
    for (i in 1:10){
      sqrt(i)
    },
    foreach(i=1:10) %do% {
      sqrt(i)
    },
    foreach (i=1:10) %dopar% {
      sqrt(i)
    },
    foreach (i=1:10, .combine=c) %dopar% {
      sqrt(i)
    }
  )

## Unit: milliseconds
##                                                     expr    min       lq
##                          for (i in 1:10) {     sqrt(i) } 1.7048  1.90695
##                   foreach(i = 1:10) %do% {     sqrt(i) } 4.8721  5.28460
##                foreach(i = 1:10) %dopar% {     sqrt(i) } 9.3667 10.02070
##  foreach(i = 1:10, .combine = c) %dopar% {     sqrt(i) } 9.2575  9.93580
##       mean   median       uq     max neval cld
##   2.074626  2.00415  2.15235  3.7961   100 a  
##   5.775400  5.53715  5.83250 11.6193   100  b 
##  10.713187 10.36680 10.81185 19.9433   100   c
##  11.099086 10.21875 10.75500 50.4359   100   c

Note that the use of parallel processing in this case is not helping.

Try some bootstrapping to check performance on a “real” problem.

 x <- iris[which(iris[,5] != "setosa"), c(1,5)] 
 trials <- 10000 
 
 microbenchmark( 
    ### Parallel (%dopar%)
    bs1 <- foreach(icount(trials), .combine=cbind) %dopar% { 
                  ind <- sample(100, 100, replace=TRUE) 
                  result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit)) 
                  coefficients(result1) 
    },
    ### Not parallel (%do%)
    bs2 <- foreach(icount(trials), .combine=cbind) %do% { 
                  ind <- sample(100, 100, replace=TRUE) 
                  result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit)) 
                  coefficients(result1) 
                  } 
 )

## Unit: seconds
##                                                                                                                                                                                                    expr
##  bs1 <- foreach(icount(trials), .combine = cbind) %dopar% {     ind <- sample(100, 100, replace = TRUE)     result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit))     coefficients(result1) }
##     bs2 <- foreach(icount(trials), .combine = cbind) %do% {     ind <- sample(100, 100, replace = TRUE)     result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit))     coefficients(result1) }
##       min       lq     mean   median       uq      max neval cld
##  20.62518 25.93794 28.02471 28.67176 30.53672 36.54693   100  a 
##  28.26366 36.22772 38.81467 38.58027 42.66944 50.61312   100   b

 ### Look at the bootstrap since we ran it
 bs1 <- t(bs1)
 bs2 <- t(bs2)
 
 apply(bs1,2,mean)

## (Intercept)   x[ind, 1] 
##  -13.112649    2.099639

 apply(bs1,2,var)

## (Intercept)   x[ind, 1] 
##   9.5801903   0.2450442

 apply(bs1,2,hist)

## $`(Intercept)`
## $breaks
##  [1] -34 -32 -30 -28 -26 -24 -22 -20 -18 -16 -14 -12 -10  -8  -6  -4  -2
## 
## $counts
##  [1]    1    0    1    8   13   53  166  426  988 1911 2527 2423 1185  272
## [15]   24    2
## 
## $density
##  [1] 0.00005 0.00000 0.00005 0.00040 0.00065 0.00265 0.00830 0.02130
##  [9] 0.04940 0.09555 0.12635 0.12115 0.05925 0.01360 0.00120 0.00010
## 
## $mids
##  [1] -33 -31 -29 -27 -25 -23 -21 -19 -17 -15 -13 -11  -9  -7  -5  -3
## 
## $xname
## [1] "newX[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $`x[ind, 1]`
## $breaks
##  [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
## 
## $counts
##  [1]    1   28  950 3601 3466 1470  393   77   12    1    1
## 
## $density
##  [1] 0.0002 0.0056 0.1900 0.7202 0.6932 0.2940 0.0786 0.0154 0.0024 0.0002
## [11] 0.0002
## 
## $mids
##  [1] 0.25 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25
## 
## $xname
## [1] "newX[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

 apply(bs2,2,mean)

## (Intercept)   x[ind, 1] 
##  -13.172504    2.110227

 apply(bs2,2,var)

## (Intercept)   x[ind, 1] 
##   10.212764    0.261791

 apply(bs2,2,hist)

## $`(Intercept)`
## $breaks
##  [1] -40 -38 -36 -34 -32 -30 -28 -26 -24 -22 -20 -18 -16 -14 -12 -10  -8
## [18]  -6  -4  -2
## 
## $counts
##  [1]    1    0    0    0    0    2   12   19   73  195  466  942 1855 2581
## [15] 2375 1166  278   34    1
## 
## $density
##  [1] 0.00005 0.00000 0.00000 0.00000 0.00000 0.00010 0.00060 0.00095
##  [9] 0.00365 0.00975 0.02330 0.04710 0.09275 0.12905 0.11875 0.05830
## [17] 0.01390 0.00170 0.00005
## 
## $mids
##  [1] -39 -37 -35 -33 -31 -29 -27 -25 -23 -21 -19 -17 -15 -13 -11  -9  -7
## [18]  -5  -3
## 
## $xname
## [1] "newX[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $`x[ind, 1]`
## $breaks
##  [1] 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5
## 
## $counts
##  [1]   43  906 3567 3483 1454  430   92   22    2    0    0    1
## 
## $density
##  [1] 0.0086 0.1812 0.7134 0.6966 0.2908 0.0860 0.0184 0.0044 0.0004 0.0000
## [11] 0.0000 0.0002
## 
## $mids
##  [1] 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25 5.75 6.25
## 
## $xname
## [1] "newX[, i]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

A Quick Demo of Parallel Processing in R

Oliver de Pug

SPring 2019